In [1]:
# Render matplotlib-backed figures inline in the notebook output.
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
from plotnine import *
mRNA-Seq data from 10 individual C. elegans worms, processed with the CEL-Seq-pipeline (https://github.com/eco32i/CEL-Seq-pipeline).
In [3]:
# Peek at the first lines of the UMI count table to check its layout.
!head ../data/CE_exp.umi.tab
In [4]:
# Peek at the last lines of the table. The loading cell skips a 5-line
# footer, so presumably the trailing rows shown here are not gene counts
# -- TODO confirm against the output.
!tail ../data/CE_exp.umi.tab
In [5]:
# Load the tab-separated UMI count table into a DataFrame.
# skipfooter=5 drops the last five lines of the file (presumably summary
# rows rather than genes -- confirm against the `tail` output).
# skipfooter is only implemented by the pure-Python parser, so request that
# engine explicitly instead of relying on pandas' fallback, which emits a
# ParserWarning.
ce = pd.read_csv(
    '../data/CE_exp.umi.tab',
    sep='\t',
    skipfooter=5,
    engine='python',
)
ce
Out[5]:
PCA is sensitive to variable scaling, so the data must be standardized before performing the analysis. StandardScaler transforms every variable to unit scale (mean 0, variance 1). Note also that sklearn expects columns to be genes (features) and rows to be worms (samples, or observations), so we transpose the matrix before doing anything else.
In [ ]:
# Optional filter (disabled): keep only genes whose mean count across samples exceeds 500.
#ce = ce.loc[ce.iloc[:,1:].mean(axis=1)>500,:]
In [6]:
# Standardize the expression matrix before PCA.
# The first column is skipped (presumably the gene identifier -- confirm)
# and the remaining columns are transposed so that rows are samples and
# columns are genes, which is the layout sklearn expects.
# DataFrame.ix was removed in pandas 1.0; .iloc performs the same
# positional column slice.
X_std = StandardScaler().fit_transform(ce.iloc[:, 1:].values.T)
X_std
Out[6]:
In [7]:
# Fit a PCA with 10 components on the standardized matrix and project the
# samples onto those components.
# NOTE(review): if there are exactly 10 samples (as the hard-coded sample
# labels in the plotting cells suggest), the centered matrix has rank at
# most 9, so the 10th component should carry ~0 variance -- confirm.
sklearn_pca = sklearnPCA(n_components=10)
# fit_transform fits the model and returns the projected coordinates.
Y_sklearn = sklearn_pca.fit_transform(X_std)
Y_sklearn
Out[7]:
Y_sklearn is a numpy array of shape (num_samples, n_components) that holds the original X data projected onto the extracted principal components.
In [8]:
# Amount of variance explained by each fitted principal component.
sklearn_pca.explained_variance_
Out[8]:
In [9]:
# Fraction of the total variance explained by each fitted principal component.
sklearn_pca.explained_variance_ratio_
Out[9]:
In [10]:
# Scree plot: explained-variance ratio against the 1-based component number.
vdf = pd.DataFrame({
    'PC': list(range(1, len(sklearn_pca.explained_variance_ratio_) + 1)),
    'var': sklearn_pca.explained_variance_ratio_,
})
(
    ggplot(vdf, aes('PC', 'var'))
    + geom_point(size=5)
    + ylab('Explained variance')
    + theme(figure_size=(12, 10))
)
Out[10]:
In [11]:
# Scatter of the samples on the first two principal components,
# one colour per sample (labelled CE_1 .. CE_10).
pca_df = pd.DataFrame({
    'sample': ['CE_%i' % worm for worm in range(1, 11)],
    'PC1': Y_sklearn[:, 0],
    'PC2': Y_sklearn[:, 1],
})
(
    ggplot(pca_df, aes('PC1', 'PC2', color='sample'))
    + geom_point(size=5)
    + theme(figure_size=(12, 10))
)
Out[11]:
In [13]:
# Scatter of the samples on the first and third principal components,
# one colour per sample (labelled CE_1 .. CE_10).
pca_df = pd.DataFrame(
    {'sample': ['CE_%i' % (idx + 1) for idx in range(10)]}
).assign(
    PC1=Y_sklearn[:, 0],
    PC3=Y_sklearn[:, 2],
)
(
    ggplot(pca_df, aes('PC1', 'PC3', color='sample'))
    + geom_point(size=5)
    + theme(figure_size=(12, 10))
)
Out[13]:
In [20]:
# Scatter of the samples on the second and fourth principal components,
# one colour per sample (labelled CE_1 .. CE_10).
# Columns are named PC2/PC4 to match the PC1/PC2/PC3 naming used by the
# other PCA scatter cells (they were PCA2/PCA4, which also mislabelled the
# axes relative to the sibling plots).
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PC2'] = Y_sklearn[:, 1]
pca_df['PC4'] = Y_sklearn[:, 3]
g = (
    ggplot(pca_df, aes(x='PC2', y='PC4', color='sample'))
    + geom_point(size=10)
)
# Display the plot as the cell's last expression, like the other plotting
# cells, instead of print(g): rich display does not depend on print()
# drawing the figure, which is not guaranteed across plotnine releases.
g
In [ ]: